1 Initial setup

1.1 Setup of the environment:

devtools::install_github("sdam-au/sdam") # loading SDAM custom package, if not working try devtools::install_github("mplex/cedhar", subdir="pkg/sdam")
#devtools::install_github("mplex/cedhar", subdir="pkg/sdam")
library(tidyverse)
library(sdam)
library(jsonlite)
library(leaflet)
library(tidytext)

1.2 Loading data

  1. Load the dataset, if you have Sciencedata.dk credentials
# Fetch the cleaned EDH JSON from sciencedata.dk (requires valid credentials
# in `mycred_secret`); `request()` comes from the sdam package.
# Fixed: use `<-` for assignment instead of `=`, per R convention.
resp <- request("EDH_text_cleaned_2020-10-09.json",
                path = "/sharingin/648597@au.dk/SDAM_root/SDAM_data/EDH/public",
                method = "GET", cred = mycred_secret)
  1. Make a list and tibble from the request function
# Parse the JSON payload of the response into an R list ...
list_json <- jsonlite::fromJSON(resp)
# ... and convert that list into a tibble for tidyverse-style analysis
EDH_tibble <- as_tibble(list_json)
  1. Display the first 6 records
# Show the first 6 records of the dataset
print(head(EDH_tibble))

2 Tidy text analysis of the clean_text_interpretive_word column

2.1 Tokenizing words, splitting on an empty space

# Tokenize the interpretive clean text: one output row per word, splitting
# on a single space; rows whose `word` is NA are dropped.
EDH_tokenized <- EDH_tibble %>%
  unnest_tokens(word, clean_text_interpretive_word,
                token = stringr::str_split, pattern = " ") %>%
  drop_na(word)
print(EDH_tokenized)
## # A tibble: 1,136,004 x 74
##    responsible_ind… type_of_inscrip… letter_size not_after literature
##    <chr>            <chr>            <chr>       <chr>     <chr>     
##  1 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  2 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  3 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  4 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  5 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  6 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  7 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  8 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  9 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
## 10 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
## # … with 1,135,994 more rows, and 69 more variables: work_status <chr>,
## #   height <chr>, diplomatic_text <chr>, people <list>, depth <chr>,
## #   material <chr>, type_of_monument <chr>, province_label <chr>, width <chr>,
## #   transcription <chr>, country <chr>, uri <chr>, findspot_ancient <chr>,
## #   last_update <chr>, modern_region <chr>, findspot_modern <chr>,
## #   language <chr>, id <chr>, edh_geography_uri <chr>, commentary <chr>,
## #   trismegistos_uri <chr>, not_before <chr>, external_image_uris <list>,
## #   fotos <list>, coordinates <list>, idno_tm <chr>, placenames_refs <list>,
## #   text_edition <chr>, origdate_text <chr>, layout_execution <chr>,
## #   layout_execution_text <chr>, support_objecttype <chr>,
## #   support_objecttype_text <chr>, support_material <chr>,
## #   support_material_text <chr>, support_decoration <chr>, keywords_term <chr>,
## #   keywords_term_text <chr>, type_of_inscription_clean <chr>,
## #   type_of_inscription_certainty <chr>, height_cm <dbl>, width_cm <dbl>,
## #   depth_cm <dbl>, material_clean <chr>, type_of_monument_clean <chr>,
## #   type_of_monument_certainty <chr>, province_label_clean <chr>,
## #   province_label_certainty <chr>, country_clean <chr>,
## #   country_certainty <chr>, findspot_ancient_clean <chr>,
## #   findspot_ancient_certainty <chr>, modern_region_clean <chr>,
## #   modern_region_certainty <chr>, findspot_modern_clean <chr>,
## #   findspot_modern_certainty <chr>, findspot_clean <chr>,
## #   findspot_certainty <chr>, origdate_text_clean <chr>,
## #   clean_text_conservative <chr>, clean_text_interpretive_sentence <chr>,
## #   findspot <chr>, year_of_find <chr>, present_location <chr>, religion <chr>,
## #   geography <chr>, social_economic_legal_history <chr>, military <chr>,
## #   word <chr>

2.2 Counting the most common words

# Words occurring more than 5,000 times, with `word` releveled by frequency
# (useful for ordered plotting); one pipeline step per line.
EDH_tokenized %>%
  count(word, sort = TRUE) %>%
  filter(n > 5000) %>%
  mutate(word = reorder(word, n)) %>%
  print()
## # A tibble: 22 x 2
##    word        n
##    <fct>   <int>
##  1 et      36364
##  2 dis     12165
##  3 manibus 12024
##  4 vixit    9544
##  5 in       8781
##  6 annos    7947
##  7 filius   7536
##  8 annorum  7051
##  9 est      6880
## 10 i        6827
## # … with 12 more rows

2.3 Number of total words on inscriptions per Roman province

# Total number of word tokens per province: count (province, word) pairs,
# then sum the per-word counts within each province.
EDH_tokenized %>% 
  count(province_label_clean, word, sort = TRUE) %>% 
  group_by(province_label_clean) %>% 
  summarise(total = sum(n)) %>% 
  mutate(province_label_clean = reorder(province_label_clean, total)) -> words_total_province

head(words_total_province)

# Bar chart of total word counts per province.
# Fixed: the original called theme_classic() and then theme_linedraw();
# complete ggplot2 themes replace one another, so only the last call took
# effect -- the dead theme_classic() call has been removed.
words_total_province %>% 
  ggplot(aes(total, province_label_clean)) +
  geom_col(fill = "darkblue", width = 0.7) +
  labs(x = "Number of words", y = "Province name", title = "Number of total words on inscriptions per Roman province", subtitle = "EDH dataset, n = 81,476 inscriptions") +
  theme_linedraw(base_size = 10) 

3 The most common words by Roman provinces

# Stacked horizontal bars: words with more than 1,000 occurrences, broken
# down by the province they appear in.
EDH_tokenized %>%
  count(province_label_clean, word, sort = TRUE) %>%
  group_by(province_label_clean) %>%
  filter(n > 1000) %>%
  mutate(province_label_clean = reorder(province_label_clean, n)) %>%
  ggplot(aes(x = n, y = province_label_clean)) +
  geom_col(aes(fill = word), width = 0.7) +
  labs(x = "Number of words", y = "Province name", title = "The most common words on inscriptions per Roman province", subtitle = "EDH dataset, n = 81,476 inscriptions") +
  theme_linedraw(base_size = 10)

4 The most common words by type of an inscription (epitaph)

# Words occurring more than 1,000 times on epitaphs, releveled by frequency.
EDH_tokenized %>% 
  count(type_of_inscription_clean, word, sort = TRUE) %>% 
  group_by(type_of_inscription_clean) %>% 
  filter(type_of_inscription_clean == "epitaph") %>% 
  filter(n > 1000) %>% 
  mutate(word = reorder(word, n)) -> words_epitaph

# Total token count represented in the plot (used in the subtitle below
# instead of a hard-coded figure, which could drift from the data).
total_words_epitaph <- sum(words_epitaph$n)

# Fixed: theme_minimal() was removed -- a complete theme is fully replaced
# by the later theme_linedraw() call, so it had no effect.
words_epitaph %>% 
  ggplot(aes(y = word, x = n, color = n)) +
  geom_col(width = 0.7) + 
  scale_color_gradient(low = "blue", high = "red") + 
  theme_linedraw(base_size = 9) +
  labs(x = "Number of words", y = "Word",
       title = "The most common words on epitaphs",
       subtitle = paste0("n = ", format(total_words_epitaph, big.mark = ","), " words"))

5 The most common words by type of an inscription (milestone)

# Words occurring more than 100 times on milestones, releveled by frequency.
EDH_tokenized %>% 
  count(type_of_inscription_clean, word, sort = TRUE) %>% 
  group_by(type_of_inscription_clean) %>% 
  filter(type_of_inscription_clean == "mile-/leaguestone") %>% 
  filter(n > 100) %>% 
  mutate(word = reorder(word, n)) -> words_milestone
words_milestone

# Total token count represented in the plot (used in the subtitle below
# instead of a hard-coded figure, which could drift from the data).
total_words_milestone <- sum(words_milestone$n)

# Fixed: theme_minimal() was removed -- a complete theme is fully replaced
# by the later theme_linedraw() call, so it had no effect.
words_milestone %>% 
  ggplot(aes(y = word, x = n, color = n)) +
  geom_col(width = 0.6) + 
  scale_color_gradient(low = "blue", high = "red") + 
  theme_linedraw(base_size = 9) +
  labs(x = "Number of words", y = "Word",
       title = "The most common words on milestones",
       subtitle = paste0("n = ", format(total_words_milestone, big.mark = ","), " words"))

# Save the most recent plot to disk
ggsave(filename = "../figures/EDH_milestone_common_words.png", width = 8, height = 8)

6 The most common words on milestones per province

# For milestones only: stacked bars of words with more than 50 occurrences,
# broken down by province.
EDH_tokenized %>%
  filter(type_of_inscription_clean == "mile-/leaguestone") %>%
  count(province_label_clean, word, sort = TRUE) %>%
  group_by(province_label_clean) %>%
  filter(n > 50) %>%
  mutate(province_label_clean = reorder(province_label_clean, n)) %>%
  ggplot(aes(x = n, y = province_label_clean)) +
  geom_col(aes(fill = word), width = 0.7) +
  labs(x = "Number of words", y = "Province name", title = "The most common words on milestones per Roman province", subtitle = "EDH dataset, n = 81,476 inscriptions") +
  theme_linedraw(base_size = 10)

library(wordcloud)
# Word cloud of milestone words with more than 50 occurrences per province.
# Fixed: the original's group_by() and reorder() mutate were dead code --
# wordcloud() only reads the `word` and `n` columns and ignores grouping and
# factor levels entirely -- so both steps have been removed.
EDH_tokenized %>% 
  filter(type_of_inscription_clean == "mile-/leaguestone") %>% 
  count(province_label_clean, word, sort = TRUE) %>% 
  filter(n > 50) %>%
  with(wordcloud(word, n, max.words = 200))

# Frequency of words. Source: https://www.tidytextmining.com/tfidf.html

# Per-(inscription type, word) token counts.
insc_types_words <- EDH_tokenized %>% 
  count(type_of_inscription_clean, word, sort = TRUE)

# Total token count per inscription type.
total_words <- insc_types_words %>% 
  group_by(type_of_inscription_clean) %>% 
  summarize(total = sum(n))

# Attach each type's total to every (type, word) row.
# Fixed: the join key is now spelled out explicitly rather than letting
# dplyr guess it (which also emitted a "Joining, by = ..." message).
insc_types_words <- left_join(insc_types_words, total_words,
                              by = "type_of_inscription_clean")
insc_types_words
# Distribution of term frequency (n / total) faceted by inscription type.
# The x-axis is truncated at 0.0009 to make the long right tail readable;
# ggplot2 warns about the rows removed by that limit.
insc_types_words %>%
  ggplot(aes(n / total, fill = type_of_inscription_clean)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~type_of_inscription_clean, ncol = 4, scales = "free_y")

# Save the most recent plot to disk
ggsave(filename = "../figures/EDH_freq_words_insc_type.png", width = 8, height = 8)

6.1 Rank of words (Zipf's law)

# Rank each word within its inscription type (rows arrive sorted by n
# descending from count(..., sort = TRUE), so row_number() is the frequency
# rank) and compute its term frequency n/total.
freq_by_rank <- insc_types_words %>% 
  group_by(type_of_inscription_clean) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)

freq_by_rank
# Zipf's law: term frequency against rank on log-log axes, one line per
# inscription type.
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = type_of_inscription_clean)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) + 
  scale_x_log10() +
  scale_y_log10()

# Fit a power law over the middle section of the rank range (10 < rank < 500),
# where the log-log relationship is most linear; the slope estimates the
# Zipf exponent.
rank_subset <- freq_by_rank %>%
  filter(rank > 10, rank < 500)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##     -0.9723      -0.9194
# Same Zipf plot as above, with a dashed reference line (drawn from the
# hand-picked intercept/slope) overlaid on top of the per-type curves.
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = type_of_inscription_clean)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) + 
  geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
  scale_x_log10() +
  scale_y_log10()

Commentary: the EDH corpus uses a lower percentage of the most common words than many other collections of language.

6.2 Term frequency vs inverse document frequency

# Add tf, idf and tf_idf columns, treating each inscription type as a
# "document" in the tf-idf sense.
insc_types_words <- insc_types_words %>%
  bind_tf_idf(word, type_of_inscription_clean, n)

insc_types_words
# Highest-tf-idf words first; the (constant-per-type) total column is dropped
# from the display.
insc_types_words %>%
  arrange(desc(tf_idf)) %>%
  select(-total)
# Top 15 words by tf-idf within each inscription type, as faceted bars.
# Fixes vs. the original:
#  * top_n() is given its ranking variable explicitly, so dplyr no longer
#    guesses (and no "Selecting by tf_idf" message is emitted); the selected
#    rows are identical.
#  * the first labs(x = NULL, y = "tf-idf") call was removed -- both of its
#    arguments were overridden by the later labs() call, so it was dead code.
insc_types_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(type_of_inscription_clean) %>% 
  top_n(15, tf_idf) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = type_of_inscription_clean)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~type_of_inscription_clean, ncol = 4, scales = "free_y") +
  coord_flip() +
  labs(x = "word", y = "tf-idf", title = "Term frequency - inverse document frequency (tf-idf) by type of inscription", subtitle = "EDH dataset, n = 81,476 inscriptions") +
  theme_linedraw(base_size = 10)

# Save the most recent plot to disk
ggsave("../figures/EDH_tf_idf_insc_type.png", width = 16, height = 16)